Libraries
library(tidyverse)
library(readr) # import
library(rpart) # regression trees
library(rpart.plot) # regression tree plots
library(summarytools) # summary statistics
library(party) # ctree
library(partykit) # ctree
library(caret)
library(forecast)
library(ineq) # Gini
library(precrec) # ROC curves
library(corrplot) # Correlation plots
library(plotly) # interactive ggplot2 plots :D
Data import
TEXT
# Directory holding the 2019 Austrian SILC extract ----
silc_2019_dir <- "./AT2019"
# Personal-level data for Austria 2019 (semicolon-separated CSV)
data19 <- read.csv(file.path(silc_2019_dir, "p_silc2019_ext.csv"), sep = ";")
Data Wrangling
First, we will select and rename the variables of interest. Then we will log net income and recode some of the variables.
# Select the variables of interest, rename the EU-SILC codes to readable
# names, restrict to the ad-hoc-module sample, and derive/recode variables.
data19 <- data19 %>%
select(sex, P038004, P110000nu, P111010nu, alter, M009010, M010000, M014000, M016000, M017000, M020010, M021000, M025000, M027000, M028000, M004000, M001300, M001510, M003100, M001100, M001200, M002000, M001500) %>%
rename("inc_net" = P038004, # monthly net income (label said "gross" -- TODO confirm against codebook)
"country_birth" = P110000nu, # country of birth of respondent
"citizenship" = P111010nu, # citizenship of respondent
"age" = alter, # age of respondent
"father_cit" = M009010, # citizenship of father at age 14
"father_edu" = M010000, # education of father at age 14 (highest degree attained)
"father_occup_stat" = M014000, # occupational status of father at age 14
"father_occup" = M016000, # main occupation of father at age 14
"father_manag" = M017000, # managerial position of father at age 14
"mother_cit" = M020010, # citizenship of mother at age 14
"mother_edu" = M021000, # education of mother at age 14
"mother_occup_stat" = M025000, # occupational status of mother at age 14
"mother_occup" = M027000, # main occupation of mother at age 14
"mother_manag" = M028000, # managerial position of mother at age 14
"tenancy" = M004000, # tenancy at age 14
"children" = M001300, # number of children (under 18) in respondent’s household at age 14
"adults" = M001510, # number of adults (aged 18 or more) in respondent’s household
"adults_working" = M003100, # number of working adults (aged 18 or more) in respondent’s hhd.
"father_present" = M001100, # father present in respondent’s household at age 14
"mother_present" = M001200, # mother present in respondent’s household at age 14
"adults_present" = M001500, # adults present in respondent’s household at age 14
) %>%
# Keep the module age range; non-positive codes mark refusals / "does not apply"
filter(age %in% (27:59), inc_net > 0, mother_present > 0, father_present > 0, father_cit > 0, mother_cit > 0) %>%
# We drop all answers where the respondents refused or were not able to provide information
# NOTE(review): doesn't this filtering drop the entire observations, not just
# the individual answers? -- TODO confirm
# NOTE(review): if we filter by age (instead of removing the -6 codes) we must
# explain why 27-59 and not 25-59 and adjust the text below, because the rows
# coded -6 (age outside of range) were simply removed here.
mutate("inc_net_log" = log(inc_net),
# logged net income per month of respondent
"both_parents_present" = father_present + mother_present,
# 4 = none present, 3 = one present, 2 = both present
sex = factor(ifelse(as.numeric(sex)==2, 1, 0)),
# 0 = male, 1 = female
country_birth = factor(country_birth, labels = c(1, 2, 2, 2, 3, 3)),
# collapse six origin categories into three: Austria, EU, Non-EU
father_cit = ifelse(father_cit == 1, 1, 2),
# Austria and Other
mother_cit = ifelse(mother_cit == 1, 1, 2))
# Austria and Other
Data Exploration and Visualization
To get a good grasp of our data we will first look at some simple descriptives and a correlation plot. The ad-hoc module on intergenerational transmission of disadvantages only includes “selected respondents aged over 24 years and less than 60 years”. This is why we exclude the others: coding ‘-6’ means that the year of birth is outside the 1969–1994 range or the interview was a proxy interview. Additionally, we exclude all respondents that were not part of this ad-hoc module even if of the desired age.
Data Summary
print(dfSummary(data19), method="render", style="grid", plain.ascii = F)
The data set includes 25 variables and 3600 observations. There are almost no missing values. There are slightly more females (52%) than males (48%). The income distribution is skewed to the right, meaning the median income is lower than the mean income. 83% of respondents were born in Austria and 88% are Austrian citizens. In about 85% of respondents’ households at age 14 both parents were present, and in 96% the mother was present….
Gini Index and Lorenz curve
# Gini coefficient of monthly net income (0 = perfect equality)
ineq(data19$inc_net, type = "Gini")
## [1] 0.2543448
The Gini index is 0.25 which is a bit lower than the World Bank estimate for Austria of 0.3 (2017) available at https://data.worldbank.org/indicator/SI.POV.GINI?locations=AT.
plot(Lc(data19$inc_net), col = "darkred", lwd = 3)
The Gini index corresponds to the area between the black equal-distribution line and the red line of the actual distribution.
Age pyramid
# Population pyramid: female counts drawn as positive bars, male counts
# mirrored to the negative side, then flipped so age runs vertically.
axis_breaks <- seq(27, 59, 2)
age_pyramid <- ggplot(data19, aes(x = age, fill = sex)) +
  geom_bar(data = subset(data19, sex == 1)) +
  geom_bar(data = subset(data19, sex == 0), aes(y = ..count.. * (-1))) +
  scale_x_continuous(breaks = axis_breaks, labels = abs(axis_breaks)) +
  scale_fill_manual(
    name = "Sex",
    labels = c("Male", "Female"),
    values = c("springgreen2", "slateblue1")
  ) +
  labs(
    title = "Age pyramide of ad-hoc module on intergenerational transmission of disadvantages",
    x = "Age",
    y = "Number of people"
  ) +
  theme_bw() +
  coord_flip()
# Interactive version of the pyramid
ggplotly(age_pyramid)
In the Data Frame Summary above we already saw that there are slightly more females (1) than males (0) in the data set and that the median age is 44 — while the age distribution of the sample is quite even, there are a few more older than younger individuals.
# Median age of the sample
median(data19$age)
## [1] 44
Correlation plot
# Correlation analysis: convert factor columns to numeric and drop the
# free-text occupation codes so Pearson correlations are defined.
data19cor <- data19
data19cor$sex <- as.numeric(data19cor$sex)
data19cor$country_birth <- as.numeric(data19cor$country_birth)
# Dropping the categorical variables father_occup & mother_occup
data19cor <- select(data19cor, -c(father_occup, mother_occup))
# BUG FIX: significance tests must run on the raw data, not on the
# correlation matrix.  The original code overwrote data19cor with
# cor(data19cor) first, so cor.mtest() was computing p-values for
# "correlations of the correlation matrix".
res1 <- cor.mtest(data19cor, conf.level = 0.99)
# Correlation coefficients, kept in a separate object
cor_matrix <- cor(data19cor)
# Upper-triangle ellipse plot ordered by first principal component;
# correlations insignificant at the 1% level are blanked out.
corrplot(cor_matrix, method = "ellipse", type = "upper", order = "FPC", diag = FALSE, outline = FALSE, tl.cex = .5, tl.col = "black", title = "Correlation plot", p.mat = res1$p, sig.level = 0.01, insig = "blank", mar = c(2, 2, 2, 2))
As can be seen from the correlation plot, all variables are significantly related to at least one other variable of the data set (at the 1% significance level). For better visibility insignificant correlations are blanked out. As the correlation matrix is ordered using the first principal component there is some clustering of significant correlations.
Regression Tree
# Reproducible split and cross-validation folds
set.seed(123)
# Outcome: logged monthly net income; predictors: circumstance variables only
formula = inc_net_log ~ sex + country_birth + father_cit + father_edu + father_occup_stat + father_occup + father_manag + mother_cit + mother_edu + mother_occup_stat + mother_occup + mother_manag + tenancy + children + adults + adults_working + both_parents_present
# Random 80/20 train/test assignment stored as a column
data19 <- data19 %>%
mutate(train_index = sample(c("train", "test"), nrow(data19), replace=TRUE, prob=c(0.80, 0.20)))
train <- data19 %>% filter(train_index=="train")
test <- data19 %>% filter(train_index=="test")
# 10-fold cross-validation, repeated 10 times
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = T)
# Complexity-parameter (cp) grid for rpart pruning
tuning_grid <- expand.grid(cp = seq(0, 0.02, by= 0.005))
tuning_grid
## cp
## 1 0.000
## 2 0.005
## 3 0.010
## 4 0.015
## 5 0.020
# NOTE(review): trained on the full data19 rather than the train subset
# created above -- intended? (repeated CV already resamples internally)
caret_rpart <- train(formula, data = data19, method = "rpart", trControl = fitControl, tuneGrid = tuning_grid, metric = "RMSE", na.action = na.pass)
caret_rpart
## CART
##
## 3600 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3241, 3240, 3239, 3240, 3239, 3240, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.000 0.5160590 0.1197761 0.3786742
## 0.005 0.4789493 0.1802058 0.3461936
## 0.010 0.4833582 0.1649875 0.3497187
## 0.015 0.4853702 0.1579280 0.3515222
## 0.020 0.4852788 0.1582175 0.3514775
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.005.
# Extract and draw the cross-validated tree (winning cp = 0.005)
tree_caret_final <- caret_rpart$finalModel
rpart.plot(tree_caret_final, box.palette="RdBu", nn=FALSE, type=2)

Conditional inference tree
# For the Inference Tree to work, we must have all variables as numeric data
# Conditional inference tree on the training subset, with Bonferroni-adjusted
# p-values controlling the splits (significance-based stopping).
Ctree <- ctree(formula, data = train, control = ctree_control(testtype = "Bonferroni")) # NOTE(review): ctree_control already applies the stopping rule; cross-validation via caret is attempted further below but did not work out yet
Ctree
##
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu +
## father_occup_stat + father_occup + father_manag + mother_cit +
## mother_edu + mother_occup_stat + mother_occup + mother_manag +
## tenancy + children + adults + adults_working + both_parents_present
##
## Fitted party:
## [1] root
## | [2] sex in 0
## | | [3] country_birth in 1, 2
## | | | [4] mother_cit <= 1
## | | | | [5] father_edu <= 2
## | | | | | [6] mother_edu <= -2: 7.288 (n = 9, err = 6.5)
## | | | | | [7] mother_edu > -2: 7.696 (n = 777, err = 100.1)
## | | | | [8] father_edu > 2: 7.780 (n = 365, err = 73.9)
## | | | [9] mother_cit > 1
## | | | | [10] father_edu <= 6
## | | | | | [11] adults_working <= 0: 7.092 (n = 7, err = 1.7)
## | | | | | [12] adults_working > 0: 7.540 (n = 116, err = 9.9)
## | | | | [13] father_edu > 6: 7.754 (n = 46, err = 12.8)
## | | [14] country_birth in 3: 7.313 (n = 71, err = 26.7)
## | [15] sex in 1
## | | [16] mother_edu <= 2
## | | | [17] father_cit <= 1: 7.259 (n = 897, err = 239.5)
## | | | [18] father_cit > 1: 7.089 (n = 191, err = 58.9)
## | | [19] mother_edu > 2: 7.376 (n = 398, err = 132.1)
##
## Number of inner nodes: 9
## Number of terminal nodes: 10
# Compact plot of the fitted tree
plot(Ctree, type = "simple",gp = gpar(fontsize = 6),
inner_panel=node_inner,
ip_args=list(id = FALSE), main = "Conditional Inference Tree for Austria 2019") # TODO: adjust title font size

Instead of the default tuning function we use the caret package for cross validation.
# NOTE: data argument was changed from train to data19!
# caret tunes ctree's mincriterion (1 - alpha) via repeated CV
caret_ctree <- train(formula, data = data19, method = "ctree", trControl = fitControl, na.action = na.pass)
caret_ctree
## Conditional Inference Tree
##
## 3600 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3240, 3241, 3240, 3240, 3240, 3240, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 0.4923692 0.1519835 0.3570260
## 0.50 0.4799440 0.1780711 0.3453309
## 0.99 0.4783652 0.1817853 0.3442732
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit a single ctree with the CV-selected mincriterion = 0.99 so it can
# be plotted with partykit's nicer plotting machinery
caret_ctree_B <- ctree(formula, data = data19, control = ctree_control(testtype = "Bonferroni", mincriterion = 0.99))
caret_ctree_B
##
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu +
## father_occup_stat + father_occup + father_manag + mother_cit +
## mother_edu + mother_occup_stat + mother_occup + mother_manag +
## tenancy + children + adults + adults_working + both_parents_present
##
## Fitted party:
## [1] root
## | [2] sex in 0
## | | [3] country_birth in 1, 2
## | | | [4] mother_cit <= 1
## | | | | [5] mother_occup_stat <= -2: 7.306 (n = 15, err = 7.1)
## | | | | [6] mother_occup_stat > -2: 7.725 (n = 1416, err = 209.5)
## | | | [7] mother_cit > 1
## | | | | [8] father_edu <= 6: 7.518 (n = 152, err = 21.4)
## | | | | [9] father_edu > 6: 7.820 (n = 54, err = 15.7)
## | | [10] country_birth in 3: 7.326 (n = 92, err = 32.6)
## | [11] sex in 1
## | | [12] father_edu <= 2
## | | | [13] father_cit <= 1: 7.253 (n = 1055, err = 290.7)
## | | | [14] father_cit > 1: 7.036 (n = 204, err = 64.0)
## | | [15] father_edu > 2: 7.372 (n = 612, err = 169.4)
##
## Number of inner nodes: 7
## Number of terminal nodes: 8
plot(caret_ctree_B,gp = gpar(fontsize = 6),
inner_panel=node_inner,
ip_args=list(abbreviate = FALSE,id = FALSE), main = "Opportunity Conditional Inference Tree for Austria 2019 - Cross Validated")

# Same tree as above, only drawn with a different package -- it does not look good, though
plot(caret_ctree,gp = gpar(fontsize = 6),
inner_panel=node_inner,
ip_args=list(id = FALSE))

plot(caret_ctree$finalModel, type = "simple")

Graphic representation of the tuning parameters
plot(caret_ctree) # RMSE vs. mincriterion (the p-value threshold), our resampled tuning parameter

# cp vs. cross-validated RMSE for the rpart model
plot(caret_rpart)

# plotcp(tree_1)
Predictions
# Out-of-sample predictions from the (non-cross-validated) ctree fit
test$P_AtCt <- predict(Ctree, newdata = as.data.frame(test))
# Squared prediction error per observation
test$perror <- (test$P_AtCt - test$inc_net_log)^2
# Test-set RMSE (stored as a constant column; a scalar would suffice)
# NOTE(review): na.rm = T drops NA terms but the denominator stays nrow(test),
# so the RMSE is biased downward if any predictions are NA -- TODO confirm
test$RMSE <- sqrt(sum((test$P_AtCt - test$inc_net_log)^2/nrow(test), na.rm = T))
head(test$RMSE)
## [1] 0.459181 0.459181 0.459181 0.459181 0.459181 0.459181
# Predicted vs. actual log income
plot(test$P_AtCt, test$inc_net_log) # TODO: redo with ggplot and make it pretty

# Same evaluation for the cross-validated rpart model
test$P_AtCt_caret <- predict(caret_rpart, newdata = as.data.frame(test))
test$perror_caret <- (test$P_AtCt_caret - test$inc_net_log)^2
test$RMSE_caret <- sqrt(sum((test$P_AtCt_caret - test$inc_net_log)^2/nrow(test), na.rm = T))
head(test$RMSE_caret)
## [1] 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333
Random Forest
# Conditional inference forest (500 trees) on the full data set
cf <- cforest(formula, data19, na.action = na.pass, control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99), ytrafo = NULL, scores = NULL, ntree = 500L, perturb = list(replace = FALSE, fraction = 0.8))
# Out-of-bag-style predictions on the test subset
hat_cf <- predict(cf, newdata = test, OOB = TRUE, type = "response")
# Calculate the RMSE by hand for cforest and boosted ctree to compare
# Permutation-based variable importance.  FIX: compute it ONCE and reuse it --
# varimp() permutes at random, so the original double call was both slow and
# could print values differing from the ones subsequently plotted.
importance_values <- varimp(cf, mincriterion = 0, OOB = TRUE)
importance_values
## sex country_birth father_cit
## 0.0677898397 0.0059506319 0.0056707554
## father_edu father_occup mother_cit
## 0.0054289985 0.0028044436 0.0065297354
## mother_edu mother_occup_stat mother_occup
## 0.0040748970 0.0009257607 0.0004644832
## mother_manag tenancy children
## 0.0017892779 -0.0001953003 0.0004404371
## adults adults_working both_parents_present
## -0.0006739003 0.0003050525 0.0005388467
# Reshape into a data frame sorted by importance (variable names from rownames)
importance_cf <- data.frame(importance = importance_values)
importance_cf$var_name = rownames(importance_cf)
importance_cf <- importance_cf %>% arrange( desc(importance))
# Lollipop chart of variable importance, largest at the top after coord_flip
varimpo <- ggplot(importance_cf, aes(x = var_name, y = importance)) +
geom_pointrange(shape = 21, colour = "black", fill = "white", size = 3, stroke = 1, aes(ymin = 0, ymax = importance)) +
scale_x_discrete(limits = importance_cf$var_name[order(importance_cf$importance)]) +
labs(title = "Conditional Forest variable importance - Austria 2019", x = "", y = "Mean decrease in sum of squared residuals") +
coord_flip() +
theme_light() +
theme(axis.line = element_blank(), panel.border = element_blank(), panel.grid.major.y=element_blank())
ggplotly(varimpo)
We find that sex is the single most important variable in determining one’s income in Austria. However, sex is not a generationally transmittable circumstance and, while it is a circumstance, it is not exactly what we were trying to answer with our exercise. Therefore, we exclude it in the next step and create a new conditional inference forest.
Boosted Inference Tree
# cf_boosted <- blackboost(formula, data = data19, na.action = na.pass, control = boost_control(), tree_controls = partykit::ctree_control())
# cf_boosted
#
# NOTE(review): method = "ctree2" tunes a single conditional inference tree
# over maxdepth/mincriterion -- it is not boosting; the section heading may
# be misleading (blackboost above was the boosted attempt).
cf_boosted_2 <- train(formula, data19, method = "ctree2", trControl = fitControl, tuneGrid = NULL, na.action = na.pass)
cf_boosted_2
## Conditional Inference Tree
##
## 3600 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3240, 3240, 3240, 3241, 3240, 3240, ...
## Resampling results across tuning parameters:
##
## maxdepth mincriterion RMSE Rsquared MAE
## 1 0.01 0.4851689 0.1581313 0.3514751
## 1 0.50 0.4851689 0.1581313 0.3514751
## 1 0.99 0.4851689 0.1581313 0.3514751
## 2 0.01 0.4810092 0.1726096 0.3479951
## 2 0.50 0.4810092 0.1726096 0.3479951
## 2 0.99 0.4810092 0.1726096 0.3479951
## 3 0.01 0.4796566 0.1774570 0.3457560
## 3 0.50 0.4798489 0.1768308 0.3459463
## 3 0.99 0.4792887 0.1784544 0.3458118
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were maxdepth = 3 and mincriterion = 0.99.
# Plot the winning tree (maxdepth = 3, mincriterion = 0.99)
plot(cf_boosted_2$finalModel, type = "simple")

# TODO: calculate the test-set RMSE for this model as well
Variable Importance
Cross Country Comparison
Introduction
In this part of the seminar paper, we attempt to reproduce the findings of [@brunori20], but unfortunately we do not have access to the actual EU-SILC data from 2011. Instead we reproduce the findings using the synthetic data provided by the European office of statistics (Eurostat) (https://ec.europa.eu/eurostat/web/microdata/statistics-on-income-and-living-conditions).
library(rpart)
library(rpart.plot)
library(caret)
library(party)
library(partykit) #Ctree
library(ggparty)
library(tidyverse)
library(RColorBrewer)
library(knitr)
library(knitcitations); cleanbib()
cite_options(citation_format = "pandoc", check.entries=FALSE)
library(bibtex)
library(readr)
library(summarytools)
Data Wrangling
The original data is not provided as the EU protects the privacy of the original respondents. The idea of the public microdata, is that it allows us to train and write the code using the actual variable names, but not obtaining true results. The EU-SILC public microdata files are fully synthetic and they were simulated using statistical modeling and show the statistical distributions of the original data. The main caveats of this data are, that it cannot be used for statistical inference to the wider population. The results and conclusion obtained from this public microdata are to be taken with a big grain of salt. Luckily, the individual country datasets are grouped in a coherent manner. We use the EU-SILC data from 2011 as it was the survey when additionally there were questions on inter-generational transmission. These were questions about the parents of the respondents. We want to see, whether it is possible using only circumstantial information given about the parents and respondents to predict the income of the respondents.
The unique identifier used in all four data sets is the household ID identifier: RX030 in the Personal Register, PX030 in the Personal Data, DB030 in the Household Register, and HB030 in the Household Data file. We only need to combine two of the datasets, namely the Household Register and the Personal Data. Latter contains the Ad-hoc module with the questions on intergenerational characteristics.
Following [@brunori20] we use the following variables for circumstances: Respondent’s sex (PB150), Respondent’s country of birth (Citizenship as proxy - PB220A), Presence of parents at home (PT010), Number of adults (18 or older) in respondents household (PT020), Number of working adults (18 or older) in respondents household (PT030), Father/Mother country of birth and citizenship (PT060, PT070, PT090, PT100), Father/mother education (PT110, PT120), Father/mother occupational status (PT130, PT160), Father/mother main occupation (PT150,PT180), Managerial position of father/mother(PT140,PT170), Tenancy status of the house in which respondent was living as a child (PT210).
Outcome Variables i.e. Income: Total Household gross income (HY010), Total Disposable Income (HY020), Dwelling Type (HH010), Housing (HH030).
We first use more variables than ultimately used in the analysis. We use the year of birth to calculate the age, and then exclude everyone older than 60 or younger than 27, as was done in the paper we are referring to. We first included both monthly and annual gross income. But in this cross-country analysis we use annual gross income as our outcome variable.
At first we ran the analysis with the citizenship variable included, but we ultimately decided that it is not really a circumstantial variable the way the respondent’s country of birth would have been, since it is ultimately possible to obtain a new citizenship.
# setting the data path
data_path ="./SILC_2011"
# NOTE(review): leftover debug call -- prints a machine-specific absolute path
getwd()
## [1] "C:/Users/leofi/Desktop/2187-2087_WS2020_Data_Science-Machine_learning/01_Data Science Seminar Paper"
# accessing the data: personal file (ad-hoc module) and household file
AT_personal_data <- read.csv(file.path(data_path, "AT_2011p_EUSILC.csv"))
AT_household_data <- read.csv(file.path(data_path, "AT_2011h_EUSILC.csv"))
# change the name of the identifier variable so both files share PX030
AT_household_data <- AT_household_data %>% rename("PX030" = HB030)
# joining the data on the household identifier
AT_equality_data <- AT_personal_data %>% left_join(AT_household_data, by = "PX030")
# Keep circumstance + income variables, derive age and log income, restrict
# to ages 27-59, recode citizenship, and rename codes for tree readability
AT_equality_data <- AT_equality_data %>% select(
PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090, PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210, PY200G) %>% mutate(
age = (2011 - PB140), log_income = log(HY010 + 1)
) %>% filter(
age %in% (27:59)
) %>% mutate(
citizenship = factor(PB220A, labels = c(1,2,3))
) %>%
rename(
"year_of_birth" = PB140,
"annual_income" = HY010,
"sex" = PB150,
"parents_present" = PT010,
"adults_home" = PT020,
"children_home" = PT030, # NOTE(review): per the variable list above, PT030 is the number of WORKING adults, not children -- name looks wrong; confirm against codebook
"father_cob" = PT060,
"father_cit" = PT070,
"mother_cob" = PT090,
"mother_cit" = PT100,
"father_edu" = PT110,
"mother_edu" = PT120,
"father_occup_stat" = PT130,
"mother_occup_stat" = PT160,
"father_occup" = PT150,
"mother_occup" = PT180,
"father_manag" = PT140,
"mother_manag" = PT170,
"tenancy" = PT210,
"monthly_income" = PY200G)
Summary We provide the summary statistics for Austria, which we obtained using ‘dfSummary’ from the package ‘summarytools’. Similar to the 2019 dataset, ‘AT_equality_data’ contains almost 7000 observations and no missing entries in our outcome variable, annual income. However, it does contain many missing values across the observed circumstances. We chose not to exclude those and instead deal with these missing entries via the ‘na.action’ argument when doing the statistical analysis.
print(dfSummary(AT_equality_data), method="render")
# Here we repeat the data wrangling steps for the other EU Member States.
# The per-country pipeline used to be copy-pasted once per country; it is
# now factored into two helpers so every country is processed identically.

# Wrangle a joined personal+household SILC 2011 table: keep the circumstance
# and income variables, derive age and log income, restrict to ages 27-59,
# recode citizenship, and rename the EU-SILC codes to readable names.
wrangle_silc_2011 <- function(joined_data) {
  joined_data %>%
    select(PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090, PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210, PY200G) %>%
    mutate(age = (2011 - PB140), log_income = log(HY010 + 1)) %>%
    filter(age %in% (27:59)) %>%
    mutate(citizenship = factor(PB220A, labels = c(1, 2, 3))) %>%
    rename(
      "year_of_birth" = PB140,
      "annual_income" = HY010,
      "sex" = PB150,
      "parents_present" = PT010,
      "adults_home" = PT020,
      "children_home" = PT030,
      "father_cob" = PT060,
      "father_cit" = PT070,
      "mother_cob" = PT090,
      "mother_cit" = PT100,
      "father_edu" = PT110,
      "mother_edu" = PT120,
      "father_occup_stat" = PT130,
      "mother_occup_stat" = PT160,
      "father_occup" = PT150,
      "mother_occup" = PT180,
      "father_manag" = PT140,
      "mother_manag" = PT170,
      "tenancy" = PT210,
      "monthly_income" = PY200G
    )
}

# Read the personal and household files for one country code, join them on
# the household identifier PX030, and wrangle the result.  To stay
# compatible with the previous copy-pasted code, the intermediate
# <code>_personal_data and <code>_household_data objects are also assigned
# into the global environment.
load_country_2011 <- function(code) {
  personal <- read.csv(file.path(data_path, paste0(code, "_2011p_EUSILC.csv")))
  household <- read.csv(file.path(data_path, paste0(code, "_2011h_EUSILC.csv"))) %>%
    rename("PX030" = HB030)
  assign(paste0(code, "_personal_data"), personal, envir = .GlobalEnv)
  assign(paste0(code, "_household_data"), household, envir = .GlobalEnv)
  personal %>%
    left_join(household, by = "PX030") %>%
    wrangle_silc_2011()
}

# France, Denmark, Spain, Finland, Italy, Latvia.
# Bulgaria ("BG") was previously processed here too but is currently
# excluded; add "BG" to the vector below to re-enable it.
for (country_code in c("FR", "DK", "ES", "FI", "IT", "LV")) {
  assign(paste0(country_code, "_equality_data"),
         load_country_2011(country_code),
         envir = .GlobalEnv)
}
# HTML summaries for each of the other country data sets
print(dfSummary(FR_equality_data), method="render")
print(dfSummary(DK_equality_data), method="render") # NOTE(review): possibly too many missing values for Denmark
print(dfSummary(ES_equality_data), method="render")
print(dfSummary(FI_equality_data), method="render") # NOTE(review): possibly too many missing values for Finland
print(dfSummary(IT_equality_data), method="render")
print(dfSummary(LV_equality_data), method="render")
Method: Conditional Inference Trees
ctree from party package in R
- recursive partitioning just like
rpart
rpart: maximizing an information measure
ctree: significance test procedure
Advantages
Advantages of Trees: straightforward to interpret
Advantages of Trees over linear regression models: very large set of observations can be used & model specification is no longer exogenously given
Advantages of Conditional Inference Trees over Regression and Classification Trees (CART): the algorithm automatically provides a test for the null hypothesis of equality of opportunity & prevents overfitting while CART “cannot distinguish between a significant and an insignificant improvement in the information measure” (Mingers 1987, as cited in @hot, 2) & consider the distributional properties of the measures.
Procedure
The algorithm follows a stepwise procedure [@brunori20, 7-8]:
- Choose confidence level Test the null hypothesis of independence, \(H_0^{C^p} : D(Y|C^P) = D(Y)\), for each input variable \(C^P \in \hat{\Omega}\), and obtain a p-value associated with each test, \(p^{C^p}\). \(\implies\) We adjust the p-values for multiple hypothesis testing, such that \(p_{adj.}^{C^p} = 1-(1-p^{Cp})^P\), which essentially means that we use the so called Bonferroni Correction.
- Choose feature: test all the null hypotheses of independence between the individual outcome and each of all the observable circumstances (variables). The model selects a variable, \(C^*\), with the lowest adjusted p-value. Essentially we choose such that \(C^* = \{C^P : \text{argmin} ~ p_{adj.}^{C^p} \}\).
- no hypothesis can be rejected: stop \(\implies\) If \(p_{adj.}^{C^p} > \alpha\): Exit the algorithm.
- one or more circumstances are significant: select the circumstance with the smallest p-value and proceed \(\rightarrow\) If \(p_{adj.}^{C^p} \leq \alpha\): Continue, and select \(C^*\) as the splitting variable.
- Choose split: for every possible way the selected circumstance can divide the sample into two subgroups, test the hypothesis of same mean outcome in the two resulting subgroups. Choose the splitting point with the smallest p-value. Technically, we test the discrepancy between the subsamples for each possible binary partition, s, based on \(C^*\), meaning that \(Y_s = \{Y_i : C^*_i < x^p \}\) and \(Y_{-s} = \{Y_i : C^*_i \geq x^p \}\), and obtain a p-value associated with each test, \(p^{C^*_s}\).
\(\implies\) Then split the sample based on \(C^*_s\), choosing the split point \(s\) that yields the lowest p-value, i.e. \(C^*_s = \{C^*_s : \text{argmin} ~ p^{C^*_s} \}\). 4. Repeat the procedure from step 1 on each resulting subsample.
Regression Trees
Following [@brunori20] we split the data into training and testing data by \(2/3:1/3\). Furthermore, we chose to show the results obtained using regression trees obtained from the ‘rpart’ package. The training and test data sets will be continually used also for further analysis when we proceed with ‘cTree’.
# Reproducible 2/3 : 1/3 train/test assignment for Austria.
set.seed(123)
AT_equality_data <- mutate(
  AT_equality_data,
  train_index = sample(c("train", "test"), nrow(AT_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
AT_train <- filter(AT_equality_data, train_index == "train")
AT_test <- filter(AT_equality_data, train_index == "test")
# Circumstance set shared by every tree and forest fit below.
formula <- log_income ~ sex + parents_present + adults_home + children_home + father_cob + father_cit + mother_cob + mother_cit + father_edu + mother_edu + father_occup_stat + mother_occup_stat + father_occup + mother_occup + father_manag + mother_manag + tenancy
# CART regression tree on the Austrian training split (complexity parameter .008).
AT_tree <- rpart(formula, data = AT_train, cp=.008)
AT_tree
## n= 4536
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4536 6006.255 10.86120
## 2) sex>=1.5 2284 4126.054 10.74883 *
## 3) sex< 1.5 2252 1822.114 10.97516 *
rpart.plot(AT_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Austria 2011")

# France: same 2/3 : 1/3 split; reuses the shared circumstance formula.
FR_equality_data <- mutate(
  FR_equality_data,
  train_index = sample(c("train", "test"), nrow(FR_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
FR_train <- filter(FR_equality_data, train_index == "train")
FR_test <- filter(FR_equality_data, train_index == "test")
FR_tree <- rpart(formula, data = FR_train, cp=.003)
FR_tree
## n= 7348
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 7348 5852.9600 10.72465
## 2) sex>=1.5 3766 3520.8540 10.66340 *
## 3) sex< 1.5 3582 2303.1310 10.78903
## 6) father_edu< 1.5 3090 2068.1160 10.75670 *
## 7) father_edu>=1.5 492 211.4952 10.99211 *
rpart.plot(FR_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for France 2011")

# Spain: same split; note rpart silently drops rows with missing circumstances.
ES_equality_data <- mutate(
  ES_equality_data,
  train_index = sample(c("train", "test"), nrow(ES_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
ES_train <- filter(ES_equality_data, train_index == "train")
ES_test <- filter(ES_equality_data, train_index == "test")
ES_tree <- rpart(formula, data = ES_train, cp=.003)
ES_tree
## n=11504 (60 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 11504 20621.54 10.28816 *
rpart.plot(ES_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Spain 2011")

# Denmark: same split; the tree does not grow past the root at cp = .003.
DK_equality_data <- mutate(
  DK_equality_data,
  train_index = sample(c("train", "test"), nrow(DK_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
DK_train <- filter(DK_equality_data, train_index == "train")
DK_test <- filter(DK_equality_data, train_index == "test")
DK_tree <- rpart(formula, data = DK_train, cp=.003)
DK_tree
## n=3061 (24 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3061 1395.174 11.15718 *
rpart.plot(DK_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Denmark 2011")

# Italy: same split and CART fit.
IT_equality_data <- mutate(
  IT_equality_data,
  train_index = sample(c("train", "test"), nrow(IT_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
IT_train <- filter(IT_equality_data, train_index == "train")
IT_test <- filter(IT_equality_data, train_index == "test")
IT_tree <- rpart(formula, data = IT_train, cp=.003)
IT_tree
## n= 25540
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 25540 32118.68 16.67441
## 2) sex>=1.5 12576 14422.10 16.59351 *
## 3) sex< 1.5 12964 17534.41 16.75289 *
rpart.plot(IT_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Italy 2011")

# Finland: same split; no split survives cp = .003 here either.
FI_equality_data <- mutate(
  FI_equality_data,
  train_index = sample(c("train", "test"), nrow(FI_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
FI_train <- filter(FI_equality_data, train_index == "train")
FI_test <- filter(FI_equality_data, train_index == "test")
FI_tree <- rpart(formula, data = FI_train, cp=.003)
FI_tree
## n=5612 (4 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 5612 4221.345 10.81804 *
rpart.plot(FI_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Finland 2011")

# Latvia: same split; first split is on mother's occupation rather than sex.
LV_equality_data <- mutate(
  LV_equality_data,
  train_index = sample(c("train", "test"), nrow(LV_equality_data), replace=TRUE, prob=c(0.67, 0.33))
)
LV_train <- filter(LV_equality_data, train_index == "train")
LV_test <- filter(LV_equality_data, train_index == "test")
LV_tree <- rpart(formula, data = LV_train, cp=.003)
LV_tree
## n= 4851
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4851 9915.431 9.168963
## 2) mother_occup>=4.5 3640 8286.543 9.068246 *
## 3) mother_occup< 4.5 1211 1480.978 9.471696 *
rpart.plot(LV_tree, box.palette = "RdBu", nn = FALSE, type = 2, main = "Regression Tree for Latvia 2011")

Conditional Inference Trees
# Conditional inference tree on the Austrian training data, default control
# (Bonferroni-adjusted tests; partykit's default mincriterion is 0.95).
AT_Ctree <- ctree(formula, data = AT_train)
print(AT_Ctree)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] mother_cob <= 1
## | | [3] sex <= 1
## | | | [4] father_edu <= 1: 10.932 (n = 697, err = 628.9)
## | | | [5] father_edu > 1: 11.051 (n = 1079, err = 631.2)
## | | [6] sex > 1
## | | | [7] mother_cit <= 2: 10.842 (n = 1514, err = 2235.1)
## | | | [8] mother_cit > 2: 10.566 (n = 223, err = 621.0)
## | [9] mother_cob > 1
## | | [10] sex <= 1
## | | | [11] children_home <= 5
## | | | | [12] mother_cob <= 2: 11.046 (n = 204, err = 101.4)
## | | | | [13] mother_cob > 2
## | | | | | [14] father_edu <= 1: 10.706 (n = 105, err = 122.6)
## | | | | | [15] father_edu > 1
## | | | | | | [16] mother_cit <= 2
## | | | | | | | [17] mother_manag <= 1: 10.361 (n = 24, err = 128.7)
## | | | | | | | [18] mother_manag > 1: 10.956 (n = 111, err = 43.0)
## | | | | | | [19] mother_cit > 2: 10.600 (n = 13, err = 9.6)
## | | | [20] children_home > 5: 10.100 (n = 19, err = 115.2)
## | | [21] sex > 1: 10.564 (n = 547, err = 1230.6)
##
## Number of inner nodes: 10
## Number of terminal nodes: 11
# "simple" node labels with a small font keep the deep tree legible.
plot(AT_Ctree, type = "simple", gp = gpar(fontsize = 6), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Austria 2011")

Cross Validation using the Caret package
# 10-fold cross-validation repeated 10 times; predictions are saved for later
# inspection. TRUE is used instead of the reassignable shorthand T.
# fitControl is reused by all later train() calls.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = TRUE)
# Tune ctree's mincriterion (= 1 - alpha of the splitting test) by RMSE.
AT_cctree1 <- train(formula, data = AT_train, method = "ctree", trControl = fitControl, na.action = na.pass)
AT_cctree1 # This is the suggested tree we get from applying Caret
## Conditional Inference Tree
##
## 4536 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 4082, 4083, 4084, 4083, 4082, 4083, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 1.165198 0.007522094 0.7112571
## 0.50 1.145303 0.008005095 0.6889176
## 0.99 1.138132 0.011057196 0.6829340
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit a single conditional inference tree at the CV-selected mincriterion
# and plot it as the final result.
AT_cct <- ctree(formula, data = AT_train, mincriterion = 0.99)
plot(AT_cct, gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE), main = "Opportunity Conditional Inference Tree for Austria 2011 - Cross Validated with Caret")

# Final Austrian tree on the full sample: Bonferroni-adjusted significance
# tests with mincriterion = 0.99 (i.e. alpha = 0.01).
AT_ctree2 <- ctree(formula, data = AT_equality_data,
  control = ctree_control(testtype = "Bonferroni", mincriterion = 0.99))
print(AT_ctree2)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] sex <= 1
## | | [3] mother_cob <= 2
## | | | [4] father_edu <= 0: 10.803 (n = 101, err = 234.8)
## | | | [5] father_edu > 0
## | | | | [6] mother_cit <= 2
## | | | | | [7] father_cit <= 2
## | | | | | | [8] mother_edu <= 1: 11.020 (n = 1301, err = 909.8)
## | | | | | | [9] mother_edu > 1: 11.109 (n = 1011, err = 634.3)
## | | | | | [10] father_cit > 2: 10.925 (n = 260, err = 172.8)
## | | | | [11] mother_cit > 2: 10.891 (n = 278, err = 169.6)
## | | [12] mother_cob > 2
## | | | [13] children_home <= 5: 10.788 (n = 357, err = 339.7)
## | | | [14] children_home > 5: 9.958 (n = 12, err = 113.2)
## | [15] sex > 1
## | | [16] mother_cob <= 1
## | | | [17] mother_cit <= 1
## | | | | [18] father_cit <= 2: 10.867 (n = 1679, err = 2193.3)
## | | | | [19] father_cit > 2: 10.777 (n = 306, err = 409.7)
## | | | [20] mother_cit > 1: 10.742 (n = 545, err = 853.7)
## | | [21] mother_cob > 1: 10.593 (n = 891, err = 2137.9)
##
## Number of inner nodes: 10
## Number of terminal nodes: 11
plot(AT_ctree2, type = "simple", gp = gpar(fontsize = 6), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Austria 2011 - Cross Validated with Ctree")

# Out-of-sample evaluation of the final Austrian tree on the test split.
AT_test$P_AtCt <- predict(AT_ctree2, newdata = as.data.frame(AT_test))
AT_test$perror <- (AT_test$P_AtCt - AT_test$log_income)^2
# RMSE = sqrt of the mean squared error over non-missing cases. The previous
# formula summed the NA-free squared errors but divided by nrow(AT_test),
# which understates the RMSE whenever any squared error is NA.
AT_test$RMSE <- sqrt(mean((AT_test$P_AtCt - AT_test$log_income)^2, na.rm = TRUE))
# For Austria we have a RMSE of 1.2, which is not very good. But is most likely attributed to the synthetic data.
# Plot the Errors somehow
# Conditional inference tree on the French training data (default control).
FR_Ctree <- ctree(formula, data = FR_train)
print(FR_Ctree)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] sex <= 1
## | | [3] father_edu <= 1
## | | | [4] mother_cit <= 1: 10.777 (n = 2543, err = 1660.3)
## | | | [5] mother_cit > 1: 10.690 (n = 288, err = 168.2)
## | | [6] father_edu > 1: 10.868 (n = 751, err = 466.8)
## | [7] sex > 1
## | | [8] tenancy <= 1
## | | | [9] mother_cit <= 2: 10.697 (n = 2183, err = 1924.0)
## | | | [10] mother_cit > 2: 10.607 (n = 171, err = 146.2)
## | | [11] tenancy > 1
## | | | [12] father_cit <= 1: 10.647 (n = 1203, err = 1174.1)
## | | | [13] father_cit > 1: 10.452 (n = 209, err = 263.9)
##
## Number of inner nodes: 6
## Number of terminal nodes: 7
plot(FR_Ctree, type = "simple", gp = gpar(fontsize = 6), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for France 2011")

# Tune mincriterion for France by repeated 10-fold CV (same fitControl as above).
FR_cctree <- train(formula, data = FR_train, method = "ctree",
  trControl = fitControl, na.action = na.pass)
FR_cctree # caret's suggested tuning
## Conditional Inference Tree
##
## 7348 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 6612, 6615, 6613, 6615, 6613, 6612, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 0.8969202 0.008025817 0.5931404
## 0.50 0.8893252 0.006947776 0.5815613
## 0.99 0.8872471 0.008028844 0.5785668
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit a single conditional inference tree at the CV-selected mincriterion.
FR_cct <- ctree(formula, data = FR_train, mincriterion = 0.99)
plot(FR_cct, type = "simple", gp = gpar(fontsize = 8), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for France 2011 - Cross Validated")

# Out-of-sample evaluation of the French tree on the test split.
FR_test$P_FRCt <- predict(FR_cct, newdata = as.data.frame(FR_test))
FR_test$perror <- (FR_test$P_FRCt - FR_test$log_income)^2
# RMSE = sqrt of the mean squared error over non-missing cases; summing with
# na.rm = TRUE while dividing by nrow() would understate the RMSE under NAs.
FR_test$RMSE <- sqrt(mean((FR_test$P_FRCt - FR_test$log_income)^2, na.rm = TRUE))
# RMSE 0.8
# Conditional inference tree on the Spanish training data. Note the NaN error
# terms in the printout below, caused by missing values in the sample.
ES_Ctree <- ctree(formula, data = ES_train)
print(ES_Ctree)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] mother_cit <= 1
## | | [3] father_cit <= 1
## | | | [4] sex <= 1
## | | | | [5] father_cob <= 1
## | | | | | [6] father_edu <= 0: 10.273 (n = 268, err = NaN)
## | | | | | [7] father_edu > 0: 10.477 (n = 3628, err = NaN)
## | | | | [8] father_cob > 1: 10.189 (n = 638, err = NaN)
## | | | [9] sex > 1
## | | | | [10] father_cob <= 1: 10.293 (n = 3396, err = NaN)
## | | | | [11] father_cob > 1: 10.043 (n = 661, err = NaN)
## | | [12] father_cit > 1: 10.157 (n = 1370, err = NaN)
## | [13] mother_cit > 1: 10.107 (n = 1603, err = NaN)
##
## Number of inner nodes: 6
## Number of terminal nodes: 7
plot(ES_Ctree, type = "simple", gp = gpar(fontsize = 6), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Spain 2011")

# The Spanish synthetic data set has many NAs; with na.action = na.omit almost
# all rows are dropped (note the tiny resample sizes below), so this CV result
# is unreliable.
ES_cctree <- train(formula, data = ES_train, method = "ctree",
  trControl = fitControl, na.action = na.omit)
ES_cctree # caret's suggested tuning
## Conditional Inference Tree
##
## 11564 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 0.6213837 NaN 0.6213837
## 0.50 0.6213837 NaN 0.6213837
## 0.99 0.6213837 NaN 0.6213837
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit at the (nominally) selected mincriterion and plot.
ES_cct <- ctree(formula, data = ES_train, mincriterion = 0.99)
plot(ES_cct, type = "simple", gp = gpar(fontsize = 8), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Spain 2011 - Cross Validated")

# Italy: conditional inference tree on the training data. The original call
# passed control = ctree_control() with no arguments, which is exactly the
# default control and is therefore omitted here.
IT_Ctree <- ctree(formula, data = IT_train)
print(IT_Ctree)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] sex <= 1: 16.753 (n = 12964, err = 17534.4)
## | [3] sex > 1
## | | [4] father_occup <= 7: 16.584 (n = 9419, err = 10565.6)
## | | [5] father_occup > 7: 16.623 (n = 3157, err = 3852.8)
##
## Number of inner nodes: 2
## Number of terminal nodes: 3
plot(IT_Ctree, gp = gpar(fontsize = 6), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Italy 2011")

# Tune mincriterion for Italy by repeated 10-fold CV; NAs pass through to ctree.
IT_cctree <- train(formula, data = IT_train, method = "ctree",
  trControl = fitControl, na.action = na.pass)
IT_cctree # caret again selects mincriterion = 0.99 by RMSE
## Conditional Inference Tree
##
## 25540 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 22986, 22987, 22985, 22986, 22985, 22986, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 1.124577 0.001877554 0.8663355
## 0.50 1.118683 0.005242715 0.8606371
## 0.99 1.118639 0.005288233 0.8601645
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
plot(IT_cctree$finalModel)

# Refit with partykit's ctree at the selected mincriterion and plot it.
IT_cct <- ctree(formula, data = IT_train, mincriterion = 0.99)
plot(IT_cct, gp = gpar(fontsize = 8), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Italy 2011 - Cross Validated")

# In Italy many circumstance variables contain NAs, so predictions exist only
# for complete test cases.
IT_test$P_Ct <- predict(IT_cct, newdata = as.data.frame(IT_test))
IT_test$perror <- (IT_test$P_Ct - IT_test$log_income)^2
# RMSE = sqrt of the mean squared error over non-missing cases; the previous
# version summed NA-free errors but divided by nrow(IT_test), understating the
# RMSE whenever any squared error is NA.
IT_test$RMSE <- sqrt(mean((IT_test$P_Ct - IT_test$log_income)^2, na.rm = TRUE))
# Denmark: with na.action = na.omit every row is dropped (each row carries at
# least one NA among the circumstances), so train() errors out and DK_cctree
# is never created — hence the "object not found" on the next line.
DK_cctree <- train(formula, data = DK_train, method = "ctree", trControl = fitControl, na.action = na.omit)
## Error: Every row has at least one missing value were found
DK_cctree
## Error in eval(expr, envir, enclos): Objekt 'DK_cctree' nicht gefunden
# Finland: same failure mode — too many missing values for na.omit to leave
# any complete rows.
FI_cctree <- train(formula, data = FI_train, method = "ctree", trControl = fitControl, na.action = na.omit)
## Error: Every row has at least one missing value were found
FI_cctree
## Error in eval(expr, envir, enclos): Objekt 'FI_cctree' nicht gefunden
# Conditional inference tree on the Latvian training data (default control).
LV_Ctree <- ctree(formula, data = LV_train)
print(LV_Ctree)
##
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home +
## father_cob + father_cit + mother_cob + mother_cit + father_edu +
## mother_edu + father_occup_stat + mother_occup_stat + father_occup +
## mother_occup + father_manag + mother_manag + tenancy
##
## Fitted party:
## [1] root
## | [2] mother_occup <= 4
## | | [3] mother_edu <= 2: 9.210 (n = 1849, err = 3313.9)
## | | [4] mother_edu > 2: 9.387 (n = 369, err = 638.9)
## | [5] mother_occup > 4
## | | [6] mother_edu <= 1: 9.060 (n = 1099, err = 2396.2)
## | | [7] mother_edu > 1
## | | | [8] sex <= 1: 9.190 (n = 712, err = 1905.0)
## | | | [9] sex > 1: 9.106 (n = 822, err = 1624.1)
##
## Number of inner nodes: 4
## Number of terminal nodes: 5
plot(LV_Ctree, gp = gpar(fontsize = 8), inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Latvia 2011")

# Tune mincriterion for Latvia by repeated 10-fold CV.
LV_cctree <- train(formula, data = LV_train, method = "ctree",
  trControl = fitControl, na.action = na.pass)
LV_cctree # again we choose mincriterion 0.99 based on the RMSE
## Conditional Inference Tree
##
## 4851 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 4366, 4366, 4366, 4366, 4364, 4366, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 1.442565 0.006763641 0.8731092
## 0.50 1.424655 0.008741515 0.8512662
## 0.99 1.419613 0.009915048 0.8463406
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
plot(LV_cctree$finalModel)

# Refit with the CV-selected mincriterion: caret chose 0.99 by RMSE (see the
# output above), but the original refit used 0.95, contradicting that
# selection. Bonferroni is kept explicit as the multiple-testing correction.
LV_cct <- ctree(formula, data = LV_train, control = ctree_control(testtype = "Bonferroni", mincriterion = 0.99))
plot(LV_cct, gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE), main = "Conditional Inference Tree for Latvia 2011 - Cross Validated")

# Out-of-sample evaluation of the Latvian tree on the test split.
LV_test$P_Ct <- predict(LV_cct, newdata = as.data.frame(LV_test))
LV_test$perror <- (LV_test$P_Ct - LV_test$log_income)^2
# RMSE = sqrt of the mean squared error over non-missing cases; summing with
# na.rm = TRUE while dividing by nrow() would understate the RMSE under NAs.
LV_test$RMSE <- sqrt(mean((LV_test$P_Ct - LV_test$log_income)^2, na.rm = TRUE))
#RMSE of 1.4 which is not so good, and does not speak of good predictive capabilities of the model
Conditional Forest
# Conditional random forest for Austria: 500 trees, quadratic test statistics
# with Bonferroni adjustment, subsampling 80% of rows without replacement.
AT_cf <- cforest(formula, AT_equality_data, na.action = na.pass,
  control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99),
  ytrafo = NULL, scores = NULL, ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8))
# Forest predictions for the held-out Austrian sample.
AThat_cf <- predict(AT_cf, newdata = AT_test, OOB = TRUE, type = "response")
# Permutation variable importance, printed once for inspection.
varimp(AT_cf, mincriterion = 0, OOB = TRUE)
## sex parents_present adults_home children_home
## 0.0188871224 -0.0009683862 0.0030389690 0.0008731912
## father_cob father_cit mother_cob mother_cit
## 0.0038475944 0.0039683794 0.0086866878 0.0053998440
## father_edu mother_edu mother_occup_stat father_occup
## 0.0007573548 0.0031400169 -0.0072261529 -0.0001711782
## father_manag mother_manag
## -0.0086178882 0.0060585751
# A second varimp draw feeds the plot. NOTE(review): varimp is permutation
# based, so this draw need not match the printed values exactly.
importance_cf <- data.frame(importance = varimp(AT_cf, mincriterion = 0, OOB = TRUE))
importance_cf$var_name <- rownames(importance_cf)
importance_cf <- arrange(importance_cf, desc(importance))
# Dot plot, flipped so the most important circumstance sits on top.
ggplot(importance_cf, aes(x = var_name, y = importance)) +
  coord_flip() +
  geom_point() +
  scale_x_discrete(limits = importance_cf$var_name[order(importance_cf$importance)]) +
  labs(title = "Conditional Forest variable importance - Austria 2011",
    x = "", y = "Mean decrease in sum of squared residuals") +
  theme(axis.text.y = element_text(hjust = 0))

# France: conditional forest and permutation-importance table (same settings).
FR_cf <- cforest(formula, FR_equality_data, na.action = na.pass,
  control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99),
  ytrafo = NULL, scores = NULL, ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8))
importance_cf_FR <- data.frame(importance = varimp(FR_cf, mincriterion = 0, OOB = TRUE))
importance_cf_FR$var_name <- rownames(importance_cf_FR)
importance_cf_FR <- arrange(importance_cf_FR, desc(importance))
# Italy: conditional forest and permutation-importance table (same settings).
IT_cf <- cforest(formula, IT_equality_data, na.action = na.pass,
  control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99),
  ytrafo = NULL, scores = NULL, ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8))
importance_cf_IT <- data.frame(importance = varimp(IT_cf, mincriterion = 0, OOB = TRUE))
importance_cf_IT$var_name <- rownames(importance_cf_IT)
importance_cf_IT <- arrange(importance_cf_IT, desc(importance))
# Latvia: conditional forest and permutation-importance table (same settings).
LV_cf <- cforest(formula, LV_equality_data, na.action = na.pass,
  control = ctree_control(teststat = "quadratic", testtype = "Bonferroni", mincriterion = 0.99),
  ytrafo = NULL, scores = NULL, ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8))
importance_cf_LV <- data.frame(importance = varimp(LV_cf, mincriterion = 0, OOB = TRUE))
importance_cf_LV$var_name <- rownames(importance_cf_LV)
importance_cf_LV <- arrange(importance_cf_LV, desc(importance))
# France: importance dot plot, ordered with the largest importance on top.
ggplot(importance_cf_FR, aes(x = var_name, y = importance)) +
  coord_flip() +
  geom_point() +
  scale_x_discrete(limits = importance_cf_FR$var_name[order(importance_cf_FR$importance)]) +
  labs(title = "Conditional Forest variable importance - France 2011",
    x = "", y = "Mean decrease in sum of squared residuals") +
  theme(axis.text.y = element_text(hjust = 0))

# Italy: importance dot plot, ordered with the largest importance on top.
ggplot(importance_cf_IT, aes(x = var_name, y = importance)) +
  coord_flip() +
  geom_point() +
  scale_x_discrete(limits = importance_cf_IT$var_name[order(importance_cf_IT$importance)]) +
  labs(title = "Conditional Forest variable importance - Italy 2011",
    x = "", y = "Mean decrease in sum of squared residuals") +
  theme(axis.text.y = element_text(hjust = 0))

# Latvia: importance dot plot, ordered with the largest importance on top.
ggplot(importance_cf_LV, aes(x = var_name, y = importance)) +
  coord_flip() +
  geom_point() +
  scale_x_discrete(limits = importance_cf_LV$var_name[order(importance_cf_LV$importance)]) +
  labs(title = "Conditional Forest variable importance - Latvia 2011",
    x = "", y = "Mean decrease in sum of squared residuals") +
  theme(axis.text.y = element_text(hjust = 0))

Conclusion
References